In [1]:
import requests
import pandas as pd
import plotly.express as px

# Set up Genius API credentials
access_token = "-fy2dpbY6GKJ93XPpsNRLz-2KJGwFYgOqVjxf-qet5ebM0X5cU-HOXKkNvvjAQqF"
base_url = "https://api.genius.com"
artist_name = 'J. Cole'

# Build the search URL for the artist
search_url = f'{base_url}/search?q={artist_name}'

# Set up headers with Genius API token
header = {
    'Authorization': f'Bearer {access_token}'
}

# Make a request to the Genius API
response = requests.get(search_url, headers=header)
data = response.json()

# Check if artist information is available
if 'hits' in data['response'] and data['response']['hits']:
    # Access artist information
    artist_id = data['response']['hits'][0]['result']['primary_artist']['id']

    # Initialize variables for pagination, Genius only allows 20 songs at once
    per_page = 20
    page = 1
    all_artist_songs = []

    # Fetch all songs for the artist using pagination
    while True:
        artist_songs_url = f'{base_url}/artists/{artist_id}/songs?per_page={per_page}&page={page}'

        response = requests.get(artist_songs_url, headers=header)
        songs_data = response.json()['response']['songs']

        if not songs_data:
            break  # No more songs, exit the loop

        # Append songs to the list
        all_artist_songs.extend(songs_data)

        page += 1

    # Remove songs without 'pageviews' field
    all_artist_songs = [song for song in all_artist_songs if 'stats' in song and 'pageviews' in song['stats']]

    # Extract data for plotting
    song_titles = [song['title'] for song in all_artist_songs]
    view_counts = [song['stats']['pageviews'] for song in all_artist_songs]
    release_dates = [song['release_date_components'] for song in all_artist_songs]

    # Create a DataFrame from the extracted data
    df = pd.DataFrame({
        'Song Title': song_titles,
        'View Count': view_counts,
        'Release Date': release_dates
    })

    # Convert release dates to a readable format 
    df['Release Date'] = pd.to_datetime(df['Release Date'].apply(lambda x: f"{x['year']}-{x['month']}-{x['day']}" if x is not None else None), errors='coerce')

    # Create an interactive scatter plot
    fig = px.scatter(df, x='Release Date', y='View Count', text='Song Title', title=f'View Count for {artist_name}')
    
    # Customize plot details
    fig.update_traces(textposition='top center', texttemplate='%{text}', hovertemplate='%{text}<br>Release Date: %{x}<br>View Count: %{y}', mode='markers')

    fig.show()

else:
    print(f"Artist '{artist_name}' not found.")

# Display the DataFrame
print(df)
          Song Title  View Count Release Date
0    03' Adolescence     1359521   2014-12-09
1    1 0 0 . m i l ’      379991   2021-05-14
2     1-888-88-DREAM       15445   2014-01-28
3               1985     1668465   2018-04-20
4               1993      324838   2019-07-05
..               ...         ...          ...
438         Work Out     1306066   2011-06-15
439   World is Empty       84477   2009-06-15
440       You Got It      193085   2010-11-12
441       Your Heart      293577   2021-09-24
442          Zendaya      189916   2018-02-13

[443 rows x 3 columns]

Based on the graph above, it appears that J. Cole's standout year was 2014. Despite releasing a substantial amount of music in the years between 2010 and 2013, the view counts for each were subpar compared to 2014. Notably, this was the year of one of his most iconic albums, "2014 Forest Hills Drive."

With that context, in the chunk below, I will conduct further regression analysis on all songs released in 2014. The objective is to explore if valence, the measure of how positive a song sounds, played a role in their popularity. Given that the Genius API lacks such data, I will utilize the Spotify API to access valence information. This analysis aims to unveil insights into the relationship between valence and view counts, and wether they contributed to Jcoles most succesful year

In [2]:
# Set up Genius API credentials (50 daily limit)
client_id = "1e6f9168e7a14e6fa864b7417bf97416"
client_secret = "736e86b1c08a40b88c967e3392ee57e9"

# Spotify API endpoint for obtaining an access token
token_url = "https://accounts.spotify.com/api/token"

# Set up the headers and payload for the token request
headers = {
    "Content-Type": "application/x-www-form-urlencoded",
}

payload = {
    "grant_type": "client_credentials",
    "client_id": client_id,
    "client_secret": client_secret,
}

response = requests.post(token_url, headers=headers, data=payload)
token_data = response.json()


# Check if the access token was obtained successfully
if "access_token" in token_data:
    access_token = token_data["access_token"]
    track_cache = {}  # Memoization cache

    def get_track_info(track_id):
        # Check if track information is already in the cache
        if track_id in track_cache:
            return track_cache[track_id]

        # Make API call to get track information
        track_url = f"https://api.spotify.com/v1/audio-features/{track_id}"
        track_response = requests.get(track_url, headers=headers)
        track_data = track_response.json()

        # Check if the track information was retrieved successfully
        if "valence" in track_data:
            # Cache the track information
            track_cache[track_id] = track_data
            return track_data
        else:
            print(f"Unable to retrieve valence for track with ID {track_id}")
            return None

    # Get all tracks for Jcole, artist_id obtained online
    artist_id = "6l3HvQ5sa6mXTsMTB19rO5"  # J. Cole's artist ID
    artist_url = f"https://api.spotify.com/v1/artists/{artist_id}/albums"
    headers = {
        "Authorization": f"Bearer {access_token}",
    }

    artist_response = requests.get(artist_url, headers=headers)
    artist_data = artist_response.json()

    # Check if the artist information was retrieved successfully
    if "items" in artist_data:
        all_tracks = []

        # Iterate through each album and get its tracks
        for album in artist_data["items"]:
            album_id = album["id"]
            album_url = f"https://api.spotify.com/v1/albums/{album_id}/tracks"

            album_response = requests.get(album_url, headers=headers)
            album_tracks = album_response.json()["items"]

            # Add the tracks to the list
            all_tracks.extend(album_tracks)

        # Create a DataFrame to store the data
        df_data = {'Track Name': [], 'Valence': []}

        # Populate the DataFrame with track information
        for track in all_tracks:
            track_id = track["id"]
            track_info = get_track_info(track_id)

            if track_info:
                # Append data to the DataFrame
                df_data['Track Name'].append(track['name'])
                df_data['Valence'].append(track_info['valence'])

        # Create a DataFrame from the collected data
        spotify_df = pd.DataFrame(df_data)

        # Print the DataFrame
        print(spotify_df)

    else:
        print("Unable to retrieve artist information.")
else:
    print("Failed to obtain access token.") 
                                            Track Name  Valence
0    Stick (with JID & J. Cole feat. Kenny Mason & ...    0.597
1    Ghetto Gods Freestyle (with EARTHGANG feat. 2 ...    0.584
2                 Lifestyle (with Bas feat. A$AP Ferg)    0.586
3                  Starting 5 (with Lute, Cozz & Omen)    0.713
4                        Coming Down (with Ari Lennox)    0.622
..                                                 ...      ...
154                            90 Proof (with J. Cole)    0.528
155                             LONDON (feat. J. Cole)    0.563
156      Scared Money (feat. J. Cole and Moneybagg Yo)    0.661
157                                   Johnny P's Caddy    0.581
158                        Poke It Out (feat. J. Cole)    0.803

[159 rows x 2 columns]

In the above Chunck, I created a datatable containing the song name and Valence from the Spotify API.

In [3]:
if 'hits' in data['response'] and data['response']['hits']:
    # Access artist information
    artist_id = data['response']['hits'][0]['result']['primary_artist']['id']

    # Initialize variables for pagination, Genius only allows 20 songs at once
    per_page = 20
    page = 1
    all_artist_songs = []

    while True:
        genius_artist_songs_url = f'{base_url}/artists/{artist_id}/songs?per_page={per_page}&page={page}'
        genius_response = requests.get(genius_artist_songs_url, headers=header)
        genius_songs_data = genius_response.json()['response']['songs']

        if not genius_songs_data:
            break  # No more songs, exit the loop

        # Append songs to the list
        all_artist_songs.extend(genius_songs_data)
        page += 1

    # Remove songs without 'pageviews' field
    all_artist_songs = [song for song in all_artist_songs if 'stats' in song and 'pageviews' in song['stats']]

    # Extract data for DataFrame
    song_titles = [song['title'] for song in all_artist_songs]
    view_counts = [song['stats']['pageviews'] for song in all_artist_songs]
    release_dates = [song['release_date_components'] for song in all_artist_songs]

    # Create DataFrame
    genius_df = pd.DataFrame({
        'Song Title': song_titles,
        'View Count': view_counts,
        'Release Date': release_dates
    })

    # Convert release dates to a readable format
    genius_df['Release Date'] = pd.to_datetime(genius_df['Release Date'].apply(lambda x: f"{x['year']}-{x['month']}-{x['day']}" if x is not None else None), errors='coerce')

    # Print the DataFrame
    print(genius_df)

else:
    print(f"Artist '{artist_name}' not found.")
          Song Title  View Count Release Date
0    03' Adolescence     1359521   2014-12-09
1    1 0 0 . m i l ’      379991   2021-05-14
2     1-888-88-DREAM       15445   2014-01-28
3               1985     1668465   2018-04-20
4               1993      324838   2019-07-05
..               ...         ...          ...
438         Work Out     1306066   2011-06-15
439   World is Empty       84477   2009-06-15
440       You Got It      193085   2010-11-12
441       Your Heart      293577   2021-09-24
442          Zendaya      189916   2018-02-13

[443 rows x 3 columns]

In the code chunck above, I have also extracted the song title, the view count and the release Date from the Geniues API. Now that these two dataframes have a column in column, I will be merging them using the song title column and preprocess the resulting data.

In [4]:
#Data cleaning
spotify_df.rename(columns={'Track Name': 'Song Title'}, inplace=True)

# Remove non-alphanumeric characters and multiple whitespaces for Spotify dataframe
spotify_df['Song Title'] = spotify_df['Song Title'].str.lower().str.replace(r'\W+', '').str.strip()

# Remove non-alphanumeric characters and multiple whitespaces for Genius dataframe
genius_df['Song Title'] = genius_df['Song Title'].str.lower().str.replace(r'\W+', '').str.strip()


print(spotify_df)
print(genius_df)
                                      Song Title  Valence
0        stickwithjidjcolefeatkennymasonsheckwes    0.597
1    ghettogodsfreestylewithearthgangfeat2chainz    0.584
2                    lifestylewithbasfeataapferg    0.586
3                      starting5withlutecozzomen    0.713
4                        comingdownwitharilennox    0.622
..                                           ...      ...
154                             90proofwithjcole    0.528
155                              londonfeatjcole    0.563
156           scaredmoneyfeatjcoleandmoneybaggyo    0.661
157                                johnnypscaddy    0.581
158                           pokeitoutfeatjcole    0.803

[159 rows x 2 columns]
        Song Title  View Count Release Date
0    03adolescence     1359521   2014-12-09
1           100mil      379991   2021-05-14
2      188888dream       15445   2014-01-28
3             1985     1668465   2018-04-20
4             1993      324838   2019-07-05
..             ...         ...          ...
438        workout     1306066   2011-06-15
439   worldisempty       84477   2009-06-15
440       yougotit      193085   2010-11-12
441      yourheart      293577   2021-09-24
442        zendaya      189916   2018-02-13

[443 rows x 3 columns]
C:\Users\rageg\AppData\Local\Temp\ipykernel_29532\1252738554.py:5: FutureWarning:

The default value of regex will change from True to False in a future version.

C:\Users\rageg\AppData\Local\Temp\ipykernel_29532\1252738554.py:8: FutureWarning:

The default value of regex will change from True to False in a future version.

In [5]:
#Merging the two datasets on Song Title
merged_df = pd.merge(spotify_df, genius_df, on='Song Title', how='inner')
#print(merged_df)

#Removing Duplicate songs.
merged_df = merged_df.drop_duplicates(subset="Song Title", keep="first")
print(merged_df)
          Song Title  Valence  View Count Release Date
0            95south    0.203      790139   2021-05-14
1              amari    0.207      845154   2021-05-14
2   applyingpressure    0.404      503909   2021-05-14
3    punchintheclock    0.692      370470   2021-05-14
4          interlude    0.104      894811   2021-05-07
..               ...      ...         ...          ...
72          godsgift    0.439      182831   2011-09-27
73         breakdown    0.489      226802   2011-09-11
74           workout    0.216     1306066   2011-06-15
75   thesecretrecipe    0.690      174949   2023-09-28
76     johnnypscaddy    0.581      350214   2022-01-28

[72 rows x 4 columns]

I have successfully merged the two datasets around common song. To do this, I made every value under the song title column lowercase, removed all spaces, and compared the equality of their value for the merging. This ensured that all of the songs that have been merged are indeed the same version. Those that could not be merged correctly, were ignored for the purpose of accuracy. Below I will plot the Valence vs Viewcount of all of Jcoles 2014 songs to see if their level of happiness at the time, impacted their viewcout.

In [6]:
#Filtering for songs in 2014
filtered_df = merged_df[merged_df['Release Date'].dt.year == 2014]
print(filtered_df)

fig = px.scatter(filtered_df, x='Valence', y='View Count', text='Song Title', title='Valence vs View Count for Songs Released in 2014',
                 labels={'Valence': 'Valence', 'View Count': 'View Count'})
fig.update_traces(textposition='top center', texttemplate='%{text}', hovertemplate='%{text}<br>Valence: %{x}<br>View Count: %{y}', mode='markers')
fig.show()

#Ppringt coorelatoin coefficient for analysis
correlation_coefficient = filtered_df['Valence'].corr(filtered_df['View Count'])
print(f"Correlation Coefficient: {correlation_coefficient}")
        Song Title  Valence  View Count Release Date
34     january28th    0.342     1203207   2014-12-09
35       wetdreamz    0.539     3435206   2014-12-09
36   03adolescence    0.260     1359521   2014-12-09
37  ataleof2citiez    0.343     1614927   2014-12-09
38       firesquad    0.574     1790773   2014-12-09
39        sttropez    0.317      433841   2014-12-09
40            gomd    0.336     2116386   2014-12-09
41    norolemodelz    0.494     5893124   2014-12-09
42           hello    0.318      590343   2014-12-09
43      apparently    0.570     1968202   2014-12-09
44       loveyourz    0.435     2410896   2014-12-09
45      notetoself    0.333      580015   2014-12-09
Correlation Coefficient: 0.5577323811487858

From the graph above it appears theres little to no coorelation between View Count and Valence for the songs released in 2014. But as the correlation coefficient of 0.56 suggests, there seems to some correlation between the valence and the view count of the songs released in 2014. This suggesting with higher Valence, there seems to be somewhat of a higher view count in general. Let us see if this remains true in the next plot in valence vs Viewcount of all of the songs extracted.

In [7]:
fig = px.scatter(merged_df, x='Valence', y='View Count', text='Song Title', title='Valence vs View Count')
fig.update_traces(textposition='top center', texttemplate='%{text}', hovertemplate='%{text}<br>Valence: %{x}<br>View Count: %{y}', mode='markers')


fig.show()

Based on the plot above, it can be observed that there can be little to no coorelation between how happy a soung sounds(valence) and how many views it gets in regard to the artist J.Cole. To further prove this I have extracted the correlation coefficient below.

In [8]:
correlation = merged_df['Valence'].corr(merged_df['View Count'])
print(f"Correlation between Valence and View Count: {correlation}")
Correlation between Valence and View Count: -0.03327032911909083

As previously observed, there is not much coorelation between valence and view count for the songs that was able to pass the preprocessing.

Based on the analysis of J. Cole's songs, it appears that there is a moderate correlation between the valence (positivity) of his songs and their view count for the year 2014. However, when considering all of his songs, the correlation is less pronounced, suggesting that factors other than valence may contribute to the overall popularity of his music. Further investigation and analysis may be needed to uncover additional insights into the factors influencing the popularity of J. Cole's songs across different periods.